library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.1     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

In this problem, we will explore the Google Play Store Apps Dataset, scraped from the Google Play Store. Each row in the table is an app. Here are descriptions of some of the columns.

  1. Category: the category the app belongs to
  2. Rating: overall user rating of the app (as when scraped)
  3. Reviews: number of user reviews for the app (as when scraped)
  4. Installs: number of user downloads/installs for the app (as when scraped)

Notice that we did not set the index, even though it looks like the app name would be a good candidate. This is because the app names are not unique! We can verify this with the help of a numpy function called np.unique, which takes in an array and outputs an array with all duplicates removed (the R equivalent is unique()). We see above that the table has 10,825 rows. This means that there are 10,825 apps in the dataset. But let’s calculate how many unique names there are: len(np.unique(apps.get('App'))), or in R, length(unique(df$App)). The result is smaller than the number of rows, so there must be duplicates! Because there are duplicates, the app name is not a good row label. In fact, there is no good row label in this case. In situations like this, we’ll leave the index as-is.

# Load the scraped Google Play Store table and echo it.
# Every later question in this file reads from `df`.
df <- read.csv(file = "F://R Scripts//Data//googleplaystore.csv")
df

Question 1.

Assign by_content to a dataframe which counts the number of apps per Content Rating. List the Content Ratings alphabetically. Hint: You can use group_by.

# Question 1: one row per content rating with the number of apps in it,
# sorted ascending by the rating label.
by_content <- df %>%
  group_by(Content.Rating) %>%
  summarise(count = n()) %>%
  arrange(Content.Rating)
by_content

Question 2.

Compute the number of apps in the dataset that have 1.5 stars or lower, and save the result in bad_apps and compute the number of apps in the dataset that have 4 stars or higher, and save the results in good_apps.

# Question 2 (low end): apps rated 1.5 stars or lower.
# The matching rows are kept in bad_apps_df; bad_apps holds the count itself,
# which is what the question asks for. filter() drops rows whose Rating is NA.
bad_apps_df <- df %>%
  filter(Rating <= 1.5)
bad_apps_df
bad_apps <- nrow(bad_apps_df)
cat("Count of bad apps is :", bad_apps)
## Count of bad apps is : 23
# Distinct names among the low-rated apps.
unique(bad_apps_df$App)
##  [1] "House party - live chat"                         
##  [2] "Speech Therapy: F"                               
##  [3] "Clarksburg AH"                                   
##  [4] "Truck Driving Test Class 3 BC"                   
##  [5] "BD Provider App"                                 
##  [6] "BJ Bridge Standard American 2018"                
##  [7] "MbH BM"                                          
##  [8] "CB Mobile Access"                                
##  [9] "CB Mobile Biz"                                   
## [10] "CF Climb"                                        
## [11] "Thistletown CI"                                  
## [12] "CJ DVD Rentals"                                  
## [13] "Hercules CP Mobile"                              
## [14] "CR Magazine"                                     
## [15] "Tech CU Card Manager"                            
## [16] "Quiz DC"                                         
## [17] "DS Creator 2.0"                                  
## [18] "DT future1 cam"                                  
## [19] "EY TaxChat"                                      
## [20] "FE Mechanical Engineering Prep"                  
## [21] "Familial Hypercholesterolaemia Handbook"         
## [22] "FK Atlantas"                                     
## [23] "Lottery Ticket Checker - Florida Results & Lotto"
# Question 2 (high end): apps rated 4 stars or higher.
# The matching rows are kept in good_apps_df; good_apps holds the count itself,
# which is what the question asks for. filter() drops rows whose Rating is NA.
good_apps_df <- df %>%
  filter(Rating >= 4)
good_apps_df
good_apps <- nrow(good_apps_df)
cat("Count of Good apps is :", good_apps)
## Count of Good apps is : 7369

Question 3.

How many apps listed have the word “Google” (with that exact capitalization) in the App name? Save the number as google_apps. Note: each row is a separate App and should be counted as such (even if the names are the same)

# Question 3: rows whose App name contains "Google" with exactly that
# capitalisation. fixed = TRUE matches the literal, case-sensitive substring,
# so lowercase "google" is no longer counted. Every row counts separately,
# even when the same name appears more than once.
# NOTE(review): any count recorded with the earlier "google|Google" pattern
# may differ from this one; re-run the chunk to refresh it.
google_apps_df <- df %>%
  filter(grepl("Google", App, fixed = TRUE))
google_apps_df
google_apps <- nrow(google_apps_df)
cat("Number of Google Apps is :", google_apps)
## Number of Google Apps is : 94

Question 4.

Of the apps under the Category COMMUNICATION, get the Content Rating count for apps where the Rating is between or equal to 4 and 5 (inclusive). Return this as a table sorted by count (with greatest count at the top of the table) called top_communication_apps. Hint: creating two Dataframes may help. Even though it is possible to do this in one line, sometimes it’s better to break up the tasks to make the overall process clearer. We will only check the final table though.

# Question 4: within the COMMUNICATION category, count apps per content
# rating among those rated from 4 to 5 inclusive, largest count first.
top_communication_apps <- df %>%
  filter(Category == "COMMUNICATION", Rating >= 4, Rating <= 5) %>%
  group_by(Content.Rating) %>%
  summarise(Count = n()) %>%
  arrange(desc(Count))
top_communication_apps

Question 5.

Create a table named install_stats which has a single column, Installs, which contains the mean number of installs of apps in each type of content rating.

# Question 5: mean number of installs per content rating.
# Installs is read as text containing commas and "+" characters; strip both
# in one pass, then convert. Values that are still not numeric become NA,
# which is what the coercion warning below reports.
df$Installs <- as.numeric(gsub("[,+]", "", df$Installs))
## Warning: NAs introduced by coercion
str(df$Installs)
##  num [1:10841] 1e+04 5e+05 5e+06 5e+07 1e+05 5e+04 5e+04 1e+06 1e+06 1e+04 ...
# na.rm = TRUE stops the NA values created above from turning an entire
# group's mean into NA (a group with no numeric values at all yields NaN).
install_stats <- df %>%
  group_by(Content.Rating) %>%
  summarise(mean_installs = mean(Installs, na.rm = TRUE))
install_stats

Question 6.

Suppose a good app is one with at least one million installs and a rating of at least 4.0. Create a variable called best_category containing the name of the category with the best apps.

# Question 6: a "good" app has at least one million installs and a rating of
# at least 4.0. best_category_df keeps those rows, ascending by installs.
# best_category is a single name: the category holding the most good apps
# (interpreting "the category with the best apps" as the largest count).
# Relies on df$Installs already being numeric (converted for Question 5).
# NOTE(review): the category listing recorded after this chunk was produced
# by the earlier unique() version; re-run to refresh it.
best_category_df <- df %>%
  filter(Rating >= 4 & Installs >= 1000000) %>%
  arrange(Installs)
best_category_df
# Tally good apps per category, largest first, and keep the top name.
best_category <- best_category_df %>%
  count(Category, sort = TRUE) %>%
  slice(1) %>%
  pull(Category)
best_category
##  [1] "ART_AND_DESIGN"      "AUTO_AND_VEHICLES"   "BEAUTY"             
##  [4] "BOOKS_AND_REFERENCE" "BUSINESS"            "COMICS"             
##  [7] "COMMUNICATION"       "DATING"              "EDUCATION"          
## [10] "ENTERTAINMENT"       "EVENTS"              "FINANCE"            
## [13] "FOOD_AND_DRINK"      "HEALTH_AND_FITNESS"  "HOUSE_AND_HOME"     
## [16] "LIBRARIES_AND_DEMO"  "LIFESTYLE"           "GAME"               
## [19] "FAMILY"              "MEDICAL"             "SOCIAL"             
## [22] "SHOPPING"            "PHOTOGRAPHY"         "SPORTS"             
## [25] "TRAVEL_AND_LOCAL"    "TOOLS"               "PERSONALIZATION"    
## [28] "PRODUCTIVITY"        "PARENTING"           "WEATHER"            
## [31] "VIDEO_PLAYERS"       "NEWS_AND_MAGAZINES"  "MAPS_AND_NAVIGATION"

Question 7.

Make a bar chart of the average rating by category in which the bars are sorted from smallest to largest

# Question 7: average rating per category (missing ratings ignored),
# ordered from smallest to largest average.
data <- df %>%
  group_by(Category) %>%
  summarise(average_rating = mean(Rating, na.rm = TRUE)) %>%
  arrange(average_rating)
data
# fill (not color) paints the bar interior; color would only tint the outline.
# reorder() keeps the bars sorted by average rating. The axes are flipped so
# the category labels do not overlap, and the legend is hidden because it
# would only repeat the axis labels.
ggplot(
  data,
  aes(x = reorder(Category, average_rating), y = average_rating, fill = Category)
) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  labs(x = "Category", y = "Average rating")